library(tidyverse)
library(visdat)
library(naniar)
library(broom)
library(labelled)
library(gridExtra)
# Read the raw Melbourne housing export and keep only the six columns that
# are used in the missing-data analysis and the price models.
data <- read_csv("Melbourne_housing_FULL.csv")
house <- select(data, Price, Rooms, Type, Distance, Bedroom2, Bathroom)

Handle missing values

# Column-type / missing-value overview drawn with the "cb_safe" palette.
house %>%
  vis_dat(palette = "cb_safe")

# Missingness map with columns sorted by amount missing, drawn square.
house %>%
  vis_miss(sort_miss = TRUE) +
  theme(aspect.ratio = 1)

# Numeric missingness summaries for the whole table.
s_miss <- house %>% miss_summary()
s_miss[["miss_df_prop"]]
## [1] 0.1150128
s_miss[["miss_var_summary"]]
## [[1]]
## # A tibble: 6 x 4
##   variable n_miss     pct_miss n_miss_cumsum
##      <chr>  <int>        <dbl>         <int>
## 1    Price   7610 21.832056689          7610
## 2    Rooms      0  0.000000000          7610
## 3     Type      0  0.000000000          7610
## 4 Distance      1  0.002868864          7611
## 5 Bedroom2   8217 23.573457268         15828
## 6 Bathroom   8226 23.599277046         24054

1. Drop the observations with a missing “Price”.

# Keep only the rows in which Price was recorded.
house_clean <- filter(house, !is.na(Price))

2. The single observation with a missing “Distance” is also missing “Bedroom2” and “Bathroom”; therefore, we drop it.

# Show the row(s) that still lack a Distance value.
filter(house_clean, is.na(Distance))
## # A tibble: 1 x 6
##    Price Rooms  Type Distance Bedroom2 Bathroom
##    <int> <int> <chr>    <dbl>    <int>    <int>
## 1 616000     3     h       NA       NA       NA
# Remove them, then recompute the missingness summaries on what is left.
house_clean <- filter(house_clean, !is.na(Distance))
s_miss_2 <- house_clean %>% miss_summary()
s_miss_2[["miss_case_table"]]
## [[1]]
## # A tibble: 3 x 3
##   n_miss_in_case n_cases    pct_miss
##            <int>   <int>       <dbl>
## 1              0   20800 76.34148132
## 2              1       6  0.02202158
## 3              2    6440 23.63649710
s_miss_2[["miss_var_summary"]]
## [[1]]
## # A tibble: 6 x 4
##   variable n_miss pct_miss n_miss_cumsum
##      <chr>  <int>    <dbl>         <int>
## 1    Price      0  0.00000             0
## 2    Rooms      0  0.00000             0
## 3     Type      0  0.00000             0
## 4 Distance      0  0.00000             0
## 5 Bedroom2   6440 23.63650          6440
## 6 Bathroom   6446 23.65852         12886

This implies that, after removing the rows with missing “Price” and “Distance”, every observation missing “Bedroom2” is also missing “Bathroom”. Only 6 observations are missing “Bathroom” but not “Bedroom2”.

# Widen the cleaned data with one "<column>_NA" indicator column per
# variable, so that missingness can be mapped to plot aesthetics.
house_shadow <- house_clean %>% bind_shadow()
house_shadow
## # A tibble: 27,246 x 12
##      Price Rooms  Type Distance Bedroom2 Bathroom Price_NA Rooms_NA
##      <int> <int> <chr>    <dbl>    <int>    <int>   <fctr>   <fctr>
##  1 1480000     2     h      2.5        2        1      !NA      !NA
##  2 1035000     2     h      2.5        2        1      !NA      !NA
##  3 1465000     3     h      2.5        3        2      !NA      !NA
##  4  850000     3     h      2.5        3        2      !NA      !NA
##  5 1600000     4     h      2.5        3        1      !NA      !NA
##  6  941000     2     h      2.5        2        1      !NA      !NA
##  7 1876000     3     h      2.5        4        2      !NA      !NA
##  8 1636000     2     h      2.5        2        1      !NA      !NA
##  9 1000000     3     h      2.5       NA       NA      !NA      !NA
## 10  745000     2     t      2.5       NA       NA      !NA      !NA
## # ... with 27,236 more rows, and 4 more variables: Type_NA <fctr>,
## #   Distance_NA <fctr>, Bedroom2_NA <fctr>, Bathroom_NA <fctr>
# Rooms against Distance, one panel per Type, coloured by the Bedroom2
# missingness indicator.
ggplot(house_shadow, aes(Distance, Rooms, colour = Bedroom2_NA)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Missingness summaries computed separately within each Type.
s_miss_group <- miss_summary(group_by(house_clean, Type))
s_miss_group[["miss_case_table"]]
## [[1]]
## # A tibble: 9 x 4
##    Type n_miss_in_case n_cases    pct_miss
##   <chr>          <int>   <int>       <dbl>
## 1     h              0   15728 85.14969412
## 2     h              1       3  0.01624168
## 3     h              2    2740 14.83406421
## 4     t              0    1579 55.09420796
## 5     t              1       2  0.06978367
## 6     t              2    1285 44.83600837
## 7     u              0    3493 59.11321713
## 8     u              1       1  0.01692334
## 9     u              2    2415 40.86985954
# Bathroom against Distance, one panel per Type, drawn with naniar's
# geom_miss_point() layer.
ggplot(house_clean, aes(Distance, Bathroom)) +
  geom_miss_point() +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Same view with Rooms on the horizontal axis.
ggplot(house_clean, aes(Rooms, Bathroom)) +
  geom_miss_point() +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

library(impute)

# k-nearest-neighbour imputation (k = 10) of the (Rooms, Bathroom) pairs,
# run separately for each dwelling type.
# Rows are sorted by Type, Rooms, Bathroom before each filter so that the
# h / t / u results, stacked in that order, line up row-for-row with the
# identically sorted `house_shadow_2` built further down.
# impute.knn() returns a list; the imputed matrix is its `data` element.
house_impute_h <- house_shadow %>%
  arrange(Type, Rooms, Bathroom) %>%
  filter(Type == "h") %>%
  select(Rooms, Bathroom)
house_impute_h <- impute.knn(as.matrix(house_impute_h), k = 10)

house_impute_t <- house_shadow %>%
  arrange(Type, Rooms, Bathroom) %>%
  filter(Type == "t") %>%
  select(Rooms, Bathroom)
house_impute_t <- impute.knn(as.matrix(house_impute_t), k = 10)

house_impute_u <- house_shadow %>%
  arrange(Type, Rooms, Bathroom) %>%
  filter(Type == "u") %>%
  select(Rooms, Bathroom)
house_impute_u <- impute.knn(as.matrix(house_impute_u), k = 10)

# Stack the three imputed matrices. Every piece must be the `$data` matrix:
# row-binding the result lists themselves does not give a numeric matrix.
house_impute <- rbind(
  house_impute_h$data,
  house_impute_t$data,
  house_impute_u$data
)
# Name the columns explicitly so the lookups below do not depend on
# impute.knn() preserving dimnames.
colnames(house_impute) <- c("Rooms", "Bathroom")

house_shadow_2 <- house_shadow %>%
  arrange(Type, Rooms, Bathroom)

# The positional write-back below is only valid when every row belongs to
# one of the three imputed types.
stopifnot(nrow(house_impute) == nrow(house_shadow_2))

# `house_impute` is a matrix, so columns are taken with `[, name]`
# (`$` is not defined for matrices).
house_shadow_2 <- house_shadow_2 %>%
  mutate(
    Rooms = house_impute[, "Rooms"],
    Bathroom = house_impute[, "Bathroom"]
  )

# Imputed Bathroom against Rooms, coloured by whether the value was
# originally missing.
ggplot(house_shadow_2, aes(x = Rooms, y = Bathroom, colour = Bathroom_NA)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)
# Keep the rows of `df` whose Type equals `type_code` and replace missing
# Bathroom values with the mean of the observed Bathroom values in that
# subset.
fill_bathroom_mean <- function(df, type_code) {
  one_type <- filter(df, Type == type_code)
  mutate(
    one_type,
    Bathroom = ifelse(is.na(Bathroom),
                      mean(Bathroom, na.rm = TRUE),
                      Bathroom)
  )
}

Bath_h <- fill_bathroom_mean(house_shadow, "h")
Bath_t <- fill_bathroom_mean(house_shadow, "t")
Bath_u <- fill_bathroom_mean(house_shadow, "u")

# Stack the per-type results in h, t, u order.
house_shadow_3 <- rbind(Bath_h, Bath_t, Bath_u)

# Mean-filled Bathroom against Rooms, coloured by the original missingness.
ggplot(house_shadow_3, aes(Rooms, Bathroom, colour = Bathroom_NA)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Mean-filled Bathroom against Distance, coloured the same way.
ggplot(house_shadow_3, aes(Distance, Bathroom, colour = Bathroom_NA)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

3.Fill missing “Bathroom” by the mean of each type.

# Keep the rows of `df` whose Type equals `type_code` and replace missing
# Bathroom values with the mean of the observed Bathroom values in that
# subset.
fill_bathroom_mean <- function(df, type_code) {
  one_type <- filter(df, Type == type_code)
  mutate(
    one_type,
    Bathroom = ifelse(is.na(Bathroom),
                      mean(Bathroom, na.rm = TRUE),
                      Bathroom)
  )
}

Bath_clean_h <- fill_bathroom_mean(house_clean, "h")
Bath_clean_t <- fill_bathroom_mean(house_clean, "t")
Bath_clean_u <- fill_bathroom_mean(house_clean, "u")

# Overwrite house_clean with the per-type results stacked in h, t, u order.
house_clean <- rbind(Bath_clean_h, Bath_clean_t, Bath_clean_u)

# Bedroom2 against Rooms, one panel per Type, drawn with naniar's
# geom_miss_point() layer.
ggplot(house_clean, aes(Rooms, Bedroom2)) +
  geom_miss_point() +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Same view with Distance on the horizontal axis.
ggplot(house_clean, aes(Distance, Bedroom2)) +
  geom_miss_point() +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Keep the rows of `df` whose Type equals `type_code` and replace missing
# Bedroom2 values with the mean of the observed Bedroom2 values in that
# subset.
fill_bedroom_mean <- function(df, type_code) {
  one_type <- filter(df, Type == type_code)
  mutate(
    one_type,
    Bedroom2 = ifelse(is.na(Bedroom2),
                      mean(Bedroom2, na.rm = TRUE),
                      Bedroom2)
  )
}

Bed_h <- fill_bedroom_mean(house_shadow, "h")
Bed_t <- fill_bedroom_mean(house_shadow, "t")
Bed_u <- fill_bedroom_mean(house_shadow, "u")

# Stack the per-type results in h, t, u order.
house_shadow_4 <- rbind(Bed_h, Bed_t, Bed_u)

# Mean-filled Bedroom2 against Rooms, coloured by the original missingness.
ggplot(house_shadow_4, aes(Rooms, Bedroom2, colour = Bedroom2_NA)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Mean-filled Bedroom2 against Distance, coloured the same way.
ggplot(house_shadow_4, aes(Distance, Bedroom2, colour = Bedroom2_NA)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

4.Fill missing “Bedroom2” by the mean of each type.

# Keep the rows of `df` whose Type equals `type_code` and replace missing
# Bedroom2 values with the mean of the observed Bedroom2 values in that
# subset.
fill_bedroom_mean <- function(df, type_code) {
  one_type <- filter(df, Type == type_code)
  mutate(
    one_type,
    Bedroom2 = ifelse(is.na(Bedroom2),
                      mean(Bedroom2, na.rm = TRUE),
                      Bedroom2)
  )
}

Bed_clean_h <- fill_bedroom_mean(house_clean, "h")
Bed_clean_t <- fill_bedroom_mean(house_clean, "t")
Bed_clean_u <- fill_bedroom_mean(house_clean, "u")

# Overwrite house_clean with the per-type results stacked in h, t, u order,
# then turn Type into a factor before summarising.
house_clean <- rbind(Bed_clean_h, Bed_clean_t, Bed_clean_u)
house_clean <- mutate(house_clean, Type = as.factor(Type))
summary(house_clean)
##      Price              Rooms        Type         Distance    
##  Min.   :   85000   Min.   : 1.000   h:18471   Min.   : 0.00  
##  1st Qu.:  635000   1st Qu.: 2.000   t: 2866   1st Qu.: 6.40  
##  Median :  870000   Median : 3.000   u: 5909   Median :10.50  
##  Mean   : 1050189   Mean   : 2.992             Mean   :11.28  
##  3rd Qu.: 1295000   3rd Qu.: 4.000             3rd Qu.:14.00  
##  Max.   :11200000   Max.   :16.000             Max.   :48.10  
##     Bedroom2         Bathroom    
##  Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 2.000   1st Qu.:1.000  
##  Median : 3.000   Median :1.656  
##  Mean   : 2.969   Mean   :1.574  
##  3rd Qu.: 3.302   3rd Qu.:2.000  
##  Max.   :20.000   Max.   :9.000
# Compact structure listing of the cleaned table.
house_clean %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame':    27246 obs. of  6 variables:
##  $ Price   : int  1480000 1035000 1465000 850000 1600000 941000 1876000 1636000 1000000 1097000 ...
##  $ Rooms   : int  2 2 3 3 4 2 3 2 3 2 ...
##  $ Type    : Factor w/ 3 levels "h","t","u": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Distance: num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Bedroom2: num  2 2 3 3 3 ...
##  $ Bathroom: num  1 1 2 2 1 ...

Relating Price to number of rooms:

# Scatter of Price against the number of rooms.
house_clean %>%
  ggplot(aes(Rooms, Price)) +
  geom_point()

Relating Price to type:

# Price distribution per dwelling type as box plots.
house_clean %>%
  ggplot(aes(Type, Price)) +
  geom_boxplot()

This plot suggests that houses have a larger price range than townhouses and units. The price range of townhouses is the smallest compared to the other types. Also, houses have a higher median price than townhouses, which have a higher median price than units.

Relating Price to Distance:

# Scatter of Price against Distance.
house_clean %>%
  ggplot(aes(Distance, Price)) +
  geom_point()

This plot is positively skewed. This means that there is a denser population of houses at smaller distances from the CBD (between 1km and 19km) than at larger distances. In general, the further away the house is from the CBD, the less expensive it is. There are a few outliers. For example, there is a very expensive house that is only 10km from the CBD. This suggests that there must be another factor or factors affecting the price, such as the size of the house or the amount of land.

Relating Price to Bedrooms:

# Scatter of Price against Bedroom2.
house_clean %>%
  ggplot(aes(Bedroom2, Price)) +
  geom_point()

Most of the properties have between 3 and 5 bedrooms. Houses with fewer than 3 or more than 5 bedrooms are less expensive than those with 3-5 bedrooms.

Relating Price to Bathroom:

# Scatter of Price against Bathroom.
house_clean %>%
  ggplot(aes(Bathroom, Price)) +
  geom_point()

This plot is slightly positively skewed. This means that there is a higher number of houses with a lower number of bathrooms (1-4) than with a higher number of bathrooms (more than 6).

Interactions between Type and other variables

1.Interactions between Rooms and Type

# Price on Rooms and Type: additive fit (mod1_1) versus a fit that also
# includes the Rooms-by-Type interaction (mod1_2).
mod1_data <- select(house_clean, Price, Rooms, Type)
mod1_1 <- lm(Price ~ Rooms + Type, data = mod1_data)
mod1_2 <- lm(Price ~ Rooms * Type, data = mod1_data)
mod1_1 %>% tidy()
##          term  estimate std.error statistic       p.value
## 1 (Intercept)  360858.6 14809.522  24.36666 9.468044e-130
## 2       Rooms  253786.7  4282.258  59.26469  0.000000e+00
## 3       Typet -162362.5 11416.839 -14.22131  9.853357e-46
## 4       Typeu -244354.0 10079.434 -24.24283 1.810348e-128
mod1_2 %>% tidy()
##          term     estimate std.error   statistic      p.value
## 1 (Intercept)  349978.3807 16629.967 21.04504344 1.519148e-97
## 2       Rooms  257062.6348  4850.401 52.99821904 0.000000e+00
## 3       Typet -162088.5943 48379.835 -3.35033373 8.082384e-04
## 4       Typeu -193276.6625 29201.844 -6.61864583 3.692319e-11
## 5 Rooms:Typet     398.4062 16064.164  0.02480093 9.802139e-01
## 6 Rooms:Typeu  -23222.5920 12340.508 -1.88181821 5.987136e-02
# Observed prices per Type with the additive model's fitted line on top.
a_mod1_1 <- augment(mod1_1, data = mod1_data)
ggplot(mod1_data, aes(Rooms, Price, colour = Type)) +
  geom_point() +
  geom_line(aes(y = .fitted), data = a_mod1_1, colour = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 1")

# Same display for the interaction model.
a_mod1_2 <- augment(mod1_2, data = mod1_data)
ggplot(mod1_data, aes(Rooms, Price, colour = Type)) +
  geom_point() +
  geom_line(aes(y = .fitted), data = a_mod1_2, colour = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 2")

# Fitted lines of both models side by side, one colour per Type.
p1 <- a_mod1_1 %>%
  ggplot(aes(Rooms, .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 1")
p2 <- a_mod1_2 %>%
  ggplot(aes(Rooms, .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 2")
grid.arrange(p1, p2, ncol = 2)

# One row of fit statistics per model.
rbind(
  glance(mod1_1),
  glance(mod1_2)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2350728     0.2349886 561064.4  2790.615       0  4 -399329.8 798669.6
## 2 0.2351740     0.2350336 561047.9  1675.189       0  6 -399328.0 798670.0
##        BIC     deviance df.residual
## 1 798710.7 8.575597e+15       27242
## 2 798727.5 8.574463e+15       27240

Models 1 and 2 are built on “Rooms” and “Type”. R.squared and adj.r.squared increase slightly in model 2, but AIC and BIC also increase in model 2. Therefore, model 1 without interaction is better.

2.Interactions between Distance and Type

# Price on Distance and Type: additive fit (mod2_1) versus a fit that also
# includes the Distance-by-Type interaction (mod2_2).
mod2_data <- select(house_clean, Price, Distance, Type)
mod2_1 <- lm(Price ~ Distance + Type, data = mod2_data)
mod2_2 <- lm(Price ~ Distance * Type, data = mod2_data)
mod2_1 %>% tidy()
##          term   estimate std.error statistic       p.value
## 1 (Intercept) 1579224.08  7605.880 207.63200  0.000000e+00
## 2    Distance  -30408.93   517.161 -58.79973  0.000000e+00
## 3       Typet -325103.48 11309.362 -28.74640 4.744744e-179
## 4       Typeu -700014.11  8654.707 -80.88247  0.000000e+00
mod2_2 %>% tidy()
##             term   estimate  std.error statistic       p.value
## 1    (Intercept) 1657355.33  8159.7957 203.11236  0.000000e+00
## 2       Distance  -36736.63   572.0076 -64.22402  0.000000e+00
## 3          Typet -637679.99 26393.9655 -24.16007 1.290829e-127
## 4          Typeu -996862.14 15582.0299 -63.97511  0.000000e+00
## 5 Distance:Typet   28396.66  2226.3230  12.75496  3.737148e-37
## 6 Distance:Typeu   32797.37  1464.1195  22.40075 3.829478e-110
# Observed prices per Type with the additive model's fitted line on top.
a_mod2_1 <- augment(mod2_1, data = mod2_data)
ggplot(mod2_data, aes(Distance, Price, colour = Type)) +
  geom_point() +
  geom_line(aes(y = .fitted), data = a_mod2_1, colour = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 3")

# Same display for the interaction model.
a_mod2_2 <- augment(mod2_2, data = mod2_data)
ggplot(mod2_data, aes(Distance, Price, colour = Type)) +
  geom_point() +
  geom_line(aes(y = .fitted), data = a_mod2_2, colour = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 4")

# Fitted lines of both models side by side, one colour per Type.
p3 <- a_mod2_1 %>%
  ggplot(aes(Distance, .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 3")
p4 <- a_mod2_2 %>%
  ggplot(aes(Distance, .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 4")
grid.arrange(p3, p4, ncol = 2)

# One row of fit statistics per model.
rbind(
  glance(mod2_1),
  glance(mod2_2)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1  0.233705     0.2336206 561565.8  2769.426       0  4 -399354.1 798718.3
## 2  0.250578     0.2504405 555369.2  1821.603       0  6 -399050.8 798115.6
##        BIC     deviance df.residual
## 1 798759.3 8.590932e+15       27242
## 2 798173.1 8.401768e+15       27240

Models 3 and 4 are built on “Distance” and “Type”. R.squared and adj.r.squared increase in model 4, and AIC and BIC decrease in model 4. Therefore, model 4 with interaction is better.

3.Interactions between Bedroom2 and Type

# Price on Bedroom2 and Type: additive fit (mod3_1) versus a fit that also
# includes the Bedroom2-by-Type interaction (mod3_2).
mod3_data <- select(house_clean, Price, Bedroom2, Type)
mod3_1 <- lm(Price ~ Bedroom2 + Type, data = mod3_data)
mod3_2 <- lm(Price ~ Bedroom2 * Type, data = mod3_data)
mod3_1 %>% tidy()
##          term  estimate std.error statistic       p.value
## 1 (Intercept)  431491.4 16549.297  26.07310 4.832269e-148
## 2    Bedroom2  233897.5  4847.452  48.25165  0.000000e+00
## 3       Typet -168967.7 11686.731 -14.45808  3.335091e-47
## 4       Typeu -266924.9 10682.262 -24.98768 2.855176e-136
mod3_2 %>% tidy()
##             term   estimate std.error  statistic       p.value
## 1    (Intercept)  430412.67 17892.620 24.0553184 1.536410e-126
## 2       Bedroom2  234224.27  5267.057 44.4696673  0.000000e+00
## 3          Typet -203696.26 63854.878 -3.1899875  1.424401e-03
## 4          Typeu -247329.95 35994.023 -6.8714173  6.493532e-12
## 5 Bedroom2:Typet   12200.70 21763.499  0.5606037  5.750723e-01
## 6 Bedroom2:Typeu   -9673.09 16191.124 -0.5974317  5.502242e-01
# Observed prices per Type with the additive model's fitted line on top.
a_mod3_1 <- augment(mod3_1, data = mod3_data)
ggplot(mod3_data, aes(Bedroom2, Price, colour = Type)) +
  geom_point() +
  geom_line(aes(y = .fitted), data = a_mod3_1, colour = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 5")

# Same display for the interaction model.
a_mod3_2 <- augment(mod3_2, data = mod3_data)
ggplot(mod3_data, aes(Bedroom2, Price, colour = Type)) +
  geom_point() +
  geom_line(aes(y = .fitted), data = a_mod3_2, colour = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 6")

# Fitted lines of both models side by side, one colour per Type.
p5 <- a_mod3_1 %>%
  ggplot(aes(Bedroom2, .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 5")
p6 <- a_mod3_2 %>%
  ggplot(aes(Bedroom2, .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 6")
grid.arrange(p5, p6, ncol = 2)

# One row of fit statistics per model.
rbind(
  glance(mod3_1),
  glance(mod3_2)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2044428     0.2043552 572187.5  2333.555       0  4 -399864.7 799739.3
## 2 0.2044640     0.2043180 572200.8  1400.213       0  6 -399864.3 799742.6
##        BIC     deviance df.residual
## 1 799780.4 8.918990e+15       27242
## 2 799800.1 8.918752e+15       27240

Models 5 and 6 are built on “Bedroom2” and “Type”. R.squared increases slightly in model 6, but adj.r.squared decreases and AIC and BIC increase in model 6. Therefore, model 5 without interaction is better.

4.Interactions between Bathroom and Type

# Price on Bathroom and Type: additive fit (mod4_1) versus a fit that also
# includes the Bathroom-by-Type interaction (mod4_2).
mod4_data <- select(house_clean, Price, Bathroom, Type)
mod4_1 <- lm(Price ~ Bathroom + Type, data = mod4_data)
mod4_2 <- lm(Price ~ Bathroom * Type, data = mod4_data)
mod4_1 %>% tidy()
##          term  estimate std.error statistic       p.value
## 1 (Intercept)  620245.0 10354.965  59.89831  0.000000e+00
## 2    Bathroom  352284.8  5737.873  61.39642  0.000000e+00
## 3       Typet -337960.8 11267.569 -29.99412 1.705965e-194
## 4       Typeu -410646.1  8773.218 -46.80678  0.000000e+00
mod4_2 %>% tidy()
##             term   estimate std.error  statistic      p.value
## 1    (Intercept)  607539.64 10965.704  55.403613 0.000000e+00
## 2       Bathroom  359955.53  6137.748  58.646193 0.000000e+00
## 3          Typet -205305.58 44897.026  -4.572810 4.833423e-06
## 4          Typeu -349051.68 29983.711 -11.641377 3.010710e-31
## 5 Bathroom:Typet  -72801.70 23757.607  -3.064353 2.183544e-03
## 6 Bathroom:Typeu  -48839.83 23504.652  -2.077879 3.772983e-02
# Observed prices per Type with the additive model's fitted line on top.
a_mod4_1 <- augment(mod4_1, data = mod4_data)
ggplot(mod4_data, aes(Bathroom, Price, colour = Type)) +
  geom_point() +
  geom_line(aes(y = .fitted), data = a_mod4_1, colour = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 7")

# Same display for the interaction model.
a_mod4_2 <- augment(mod4_2, data = mod4_data)
ggplot(mod4_data, aes(Bathroom, Price, colour = Type)) +
  geom_point() +
  geom_line(aes(y = .fitted), data = a_mod4_2, colour = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 8")

# Fitted lines of both models side by side, one colour per Type.
p7 <- a_mod4_1 %>%
  ggplot(aes(Bathroom, .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 7")
p8 <- a_mod4_2 %>%
  ggplot(aes(Bathroom, .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 8")
grid.arrange(p7, p8, ncol = 2)

# One row of fit statistics per model.
rbind(
  glance(mod4_1),
  glance(mod4_2)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.2414173     0.2413337 558732.7  2889.902       0  4 -399216.3 798442.7
## 2 0.2417766     0.2416374 558620.9  1737.217       0  6 -399209.9 798433.8
##        BIC     deviance df.residual
## 1 798483.7 8.504469e+15       27242
## 2 798491.3 8.500441e+15       27240

Models 7 and 8 are built on “Bathroom” and “Type”. R.squared and adj.r.squared increase slightly in model 8. AIC decreases but BIC increases in model 8. Going by adj.r.squared and AIC, model 8 with interaction is taken as the better one.

Combined model: interactions between Type and the other variables

# Combined model: all main effects plus the Type interactions with
# Distance and Bathroom.
mod9 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type,
  data = house_clean
)
mod9 %>% tidy()
##              term   estimate  std.error  statistic       p.value
## 1     (Intercept)  663264.48 15006.4190  44.198718  0.000000e+00
## 2           Typet -426691.60 43218.0747  -9.872990  5.957678e-23
## 3           Typeu -546583.56 28745.2765 -19.014726  4.259709e-80
## 4           Rooms  259856.84  6736.8833  38.572264 5.846387e-317
## 5        Bedroom2  -73893.54  7844.2773  -9.420057  4.853471e-21
## 6        Distance  -42916.18   506.5156 -84.728237  0.000000e+00
## 7        Bathroom  272476.13  6484.9290  42.016826  0.000000e+00
## 8  Typet:Distance   27429.38  1950.8533  14.060197  9.566318e-45
## 9  Typeu:Distance   31833.69  1284.0267  24.792079 3.362211e-134
## 10 Typet:Bathroom  -98648.54 20679.5595  -4.770340  1.848666e-06
## 11 Typeu:Bathroom  -82537.18 20420.7129  -4.041836  5.317990e-05
# One-row table of fit statistics for mod9.
mod9 %>% glance()
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4282488     0.4280389 485134.3  2039.936       0 11 -395364.5 790752.9
##        BIC     deviance df.residual
## 1 790851.5 6.409901e+15       27235

Add interactions between Rooms and other variables to improve the model

1. Add interaction between Bathroom and Rooms

# mod9 terms plus the Bathroom-by-Rooms interaction.
mod10 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bathroom * Rooms,
  data = house_clean
)
mod10 %>% tidy()
##              term    estimate  std.error  statistic       p.value
## 1     (Intercept)  710162.301 26934.4256  26.366343 2.648267e-151
## 2           Typet -435324.966 43411.0994 -10.027965  1.262353e-23
## 3           Typeu -567783.778 30470.0532 -18.634158  5.120695e-77
## 4           Rooms  247765.719  8867.6353  27.940450 2.115589e-169
## 5        Bedroom2  -75058.449  7863.4404  -9.545243  1.467599e-21
## 6        Distance  -42829.374   508.1733 -84.281046  0.000000e+00
## 7        Bathroom  244062.214 15023.2178  16.245668  4.546691e-59
## 8  Typet:Distance   27330.638  1951.3000  14.006374  2.032623e-44
## 9  Typeu:Distance   31870.788  1284.0686  24.820161 1.699627e-134
## 10 Typet:Bathroom  -92113.081 20911.8771  -4.404821  1.062746e-05
## 11 Typeu:Bathroom  -69505.799 21344.3612  -3.256401  1.129722e-03
## 12 Rooms:Bathroom    7406.735  3532.5543   2.096708  3.602862e-02
# Fit statistics of mod9 (row 1) and mod10 (row 2).
rbind(
  glance(mod9),
  glance(mod10)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4282488     0.4280389 485134.3  2039.936       0 11 -395364.5 790752.9
## 2 0.4283411     0.4281102 485104.1  1855.118       0 12 -395362.3 790750.5
##        BIC     deviance df.residual
## 1 790851.5 6.409901e+15       27235
## 2 790857.3 6.408867e+15       27234

Drop mod10 based on BIC (i.e. drop “Bathroom*Rooms”); use mod9 to continue.

2. Add interaction between Bedroom2 and Rooms

# mod9 terms plus the Bedroom2-by-Rooms interaction.
mod11 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms,
  data = house_clean
)
mod11 %>% tidy()
##              term    estimate  std.error  statistic       p.value
## 1     (Intercept)  552136.696 26239.7239  21.042016  1.618282e-97
## 2           Typet -407161.197 43363.1481  -9.389567  6.479729e-21
## 3           Typeu -503107.825 29940.9643 -16.803327  4.797209e-63
## 4           Rooms  289335.533  8829.5610  32.768960 5.014651e-231
## 5        Bedroom2  -37998.412 10480.3632  -3.625677  2.887308e-04
## 6        Distance  -43079.248   507.2622 -84.925014  0.000000e+00
## 7        Bathroom  277489.652  6554.2522  42.337348  0.000000e+00
## 8  Typet:Distance   27410.289  1949.9391  14.056998  1.000546e-44
## 9  Typeu:Distance   31367.783  1286.5930  24.380501 6.808250e-130
## 10 Typet:Bathroom -110236.934 20791.4077  -5.302043  1.154059e-07
## 11 Typeu:Bathroom -101819.823 20750.1760  -4.906938  9.304318e-07
## 12 Rooms:Bedroom2   -9658.645  1871.2777  -5.161524  2.466664e-07
# Fit statistics of mod9 (row 1) and mod11 (row 2).
rbind(
  glance(mod9),
  glance(mod11)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4282488     0.4280389 485134.3  2039.936       0 11 -395364.5 790752.9
## 2 0.4288076     0.4285768 484906.1  1858.655       0 12 -395351.1 790728.3
##        BIC     deviance df.residual
## 1 790851.5 6.409901e+15       27235
## 2 790835.1 6.403637e+15       27234

Improvement in mod11, use mod11 to continue.

3. Add interaction between Distance and Rooms

# mod11 terms plus the Distance-by-Rooms interaction.
mod12 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms,
  data = house_clean
)
mod12 %>% tidy()
##              term    estimate std.error   statistic       p.value
## 1     (Intercept)  224859.170 29920.035   7.5153379  5.850877e-14
## 2           Typet -337732.786 43093.808  -7.8371534  4.776245e-15
## 3           Typeu -344537.284 30529.218 -11.2854933  1.799572e-29
## 4           Rooms  389081.103  9844.426  39.5229854  0.000000e+00
## 5        Bedroom2  -84467.060 10597.825  -7.9702257  1.645430e-15
## 6        Distance    1665.373  2083.953   0.7991412  4.242155e-01
## 7        Bathroom  277674.225  6496.255  42.7437397  0.000000e+00
## 8  Typet:Distance   19681.191  1964.002  10.0209618  1.354733e-23
## 9  Typeu:Distance   15400.310  1465.270  10.5102224  8.684685e-26
## 10 Typet:Bathroom -104220.654 20609.205  -5.0569952  4.286751e-07
## 11 Typeu:Bathroom -102939.387 20566.607  -5.0051712  5.616012e-07
## 12 Rooms:Bedroom2    3564.886  1948.642   1.8294210  6.734749e-02
## 13 Rooms:Distance  -13233.349   598.129 -22.1245715 1.626210e-107
# Fit statistics of mod11 (row 1) and mod12 (row 2).
rbind(
  glance(mod11),
  glance(mod12)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4288076     0.4285768 484906.1  1858.655       0 12 -395351.1 790728.3
## 2 0.4388931     0.4386459 480614.9  1775.119       0 13 -395108.5 790244.9
##        BIC     deviance df.residual
## 1 790835.1 6.403637e+15       27234
## 2 790359.9 6.290568e+15       27233

Improvement in mod12, use mod12 to continue.

Add interactions between Distance and other variables to improve the model

1. Add interaction between Bathroom and Distance

# mod12 terms plus the Bathroom-by-Distance interaction.
mod13 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance,
  data = house_clean
)
mod13 %>% tidy()
##                 term    estimate  std.error  statistic       p.value
## 1        (Intercept)  156903.413 30283.7042   5.181117  2.221319e-07
## 2              Typet -358794.096 42992.3197  -8.345539  7.420634e-17
## 3              Typeu -300330.573 30625.3576  -9.806598  1.149884e-22
## 4              Rooms  341695.385 10470.4023  32.634408 3.475372e-229
## 5           Bedroom2  -82133.044 10566.8720  -7.772692  7.953696e-15
## 6           Distance    6261.587  2107.4872   2.971115  2.969797e-03
## 7           Bathroom  411439.140 12165.7279  33.819525 1.233299e-245
## 8     Typet:Distance   23678.688  1982.0238  11.946723  8.158345e-33
## 9     Typeu:Distance   15420.144  1460.7796  10.556106  5.343957e-26
## 10    Typet:Bathroom -117935.486 20573.1553  -5.732494  1.000094e-08
## 11    Typeu:Bathroom -140804.177 20709.7814  -6.798921  1.075813e-11
## 12    Rooms:Bedroom2    2969.983  1943.2093   1.528390  1.264272e-01
## 13    Rooms:Distance   -8720.388   690.1405 -12.635670  1.699821e-36
## 14 Distance:Bathroom  -11646.456   896.6672 -12.988605  1.847588e-38
# Fit statistics of mod12 (row 1) and mod13 (row 2).
rbind(
  glance(mod12),
  glance(mod13)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4388931     0.4386459 480614.9  1775.119       0 13 -395108.5 790244.9
## 2 0.4423478     0.4420816 479141.8  1661.639       0 14 -395024.3 790078.6
##        BIC     deviance df.residual
## 1 790359.9 6.290568e+15       27233
## 2 790201.8 6.251838e+15       27232

Improvement in mod13, use mod13 to continue.

2. Add interaction between Bedroom2 and Distance

# mod13 terms plus the Bedroom2-by-Distance interaction.
mod14 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Distance,
  data = house_clean
)
mod14 %>% tidy()
##                 term    estimate  std.error  statistic       p.value
## 1        (Intercept)  165463.994 30948.6152   5.346410  9.043753e-08
## 2              Typet -360920.652 43020.9183  -8.389422  5.118836e-17
## 3              Typeu -304686.188 30796.6213  -9.893494  4.858561e-23
## 4              Rooms  356033.693 14963.9339  23.792787 7.297197e-124
## 5           Bedroom2 -100042.968 17028.8124  -5.874923  4.279547e-09
## 6           Distance    5021.048  2301.5042   2.181638  2.914480e-02
## 7           Bathroom  415835.033 12599.3422  33.004503 2.898006e-234
## 8     Typet:Distance   23995.092  1995.9856  12.021676  3.318019e-33
## 9     Typeu:Distance   16072.559  1539.6252  10.439267  1.832844e-25
## 10    Typet:Bathroom -118321.491 20574.8667  -5.750778  8.978324e-09
## 11    Typeu:Bathroom -141869.555 20724.7067  -6.845431  7.786301e-12
## 12    Rooms:Bedroom2    2585.407  1964.2235   1.316249  1.881017e-01
## 13    Rooms:Distance   -9980.675  1165.8843  -8.560605  1.180983e-17
## 14 Distance:Bathroom  -12050.539   945.9193 -12.739500  4.551563e-37
## 15 Bedroom2:Distance    1840.777  1372.5037   1.341182  1.798726e-01
# Fit statistics of mod13 (row 1) and mod14 (row 2).
rbind(
  glance(mod13),
  glance(mod14)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4423478     0.4420816 479141.8  1661.639       0 14 -395024.3 790078.6
## 2 0.4423846     0.4420980 479134.8  1543.124       0 15 -395023.4 790078.8
##        BIC     deviance df.residual
## 1 790201.8 6.251838e+15       27232
## 2 790210.2 6.251425e+15       27231

Drop mod14 based on BIC (i.e. drop “Bedroom2*Distance”); use mod13 to continue.

Add interactions between Bedroom2 and Bathroom to improve model

# mod13 terms plus the Bedroom2-by-Bathroom interaction.
mod15 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom,
  data = house_clean
)
mod15 %>% tidy()
##                 term    estimate  std.error  statistic       p.value
## 1        (Intercept)  203275.449 31285.1110   6.497514  8.306751e-11
## 2              Typet -372516.655 43030.0159  -8.657135  5.100189e-18
## 3              Typeu -328808.155 30991.0095 -10.609792  3.019760e-26
## 4              Rooms  375895.628 11985.0126  31.363807 3.760122e-212
## 5           Bedroom2  -89810.691 10641.5867  -8.439596  3.340253e-17
## 6           Distance    5953.199  2106.8606   2.825626  4.722271e-03
## 7           Bathroom  314615.923 20530.3218  15.324452  8.721998e-53
## 8     Typet:Distance   23599.606  1980.8608  11.913814  1.208936e-32
## 9     Typeu:Distance   14995.610  1461.6893  10.259096  1.194389e-24
## 10    Typet:Bathroom -106391.742 20654.9891  -5.150898  2.610471e-07
## 11    Typeu:Bathroom -115308.602 21150.5933  -5.451790  5.029803e-08
## 12    Rooms:Bedroom2   -8928.799  2811.4988  -3.175814  1.495830e-03
## 13    Rooms:Distance   -8367.288   692.3530 -12.085292  1.539512e-33
## 14 Distance:Bathroom  -12086.729   899.2719 -13.440572  4.728247e-41
## 15 Bedroom2:Bathroom   27869.158  4761.6525   5.852833  4.888023e-09
# Stack the one-row fit summaries of mod13 and mod15 (row 1 = mod13,
# row 2 = mod15) for a side-by-side comparison.
rbind(
  glance(mod13),
  glance(mod15)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4423478     0.4420816 479141.8  1661.639       0 14 -395024.3 790078.6
## 2 0.4430484     0.4427621 478849.5  1547.281       0 15 -395007.2 790046.4
##        BIC     deviance df.residual
## 1 790201.8 6.251838e+15       27232
## 2 790177.8 6.243983e+15       27231

Improvement in mod15, use mod15 to continue.

Add three-variable interactions to improve the model

1.Type,Distance and Bathroom

# mod16: the mod15 formula plus the three-way interaction
# Type * Distance * Bathroom.
mod16 <- lm(
  Price ~ Type + Rooms + Bedroom2 +
    Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom +
    Type * Distance * Bathroom,
  data = house_clean
)
# Coefficient table: estimate, std.error, statistic and p.value per term.
tidy(mod16)
##                       term    estimate  std.error   statistic
## 1              (Intercept)  189679.811 31506.4646   6.0203458
## 2                    Typet  -52785.628 91105.2170  -0.5793919
## 3                    Typeu -273101.215 47643.9715  -5.7321253
## 4                    Rooms  374596.304 11986.0836  31.2526023
## 5                 Bedroom2  -90371.788 10640.9340  -8.4928435
## 6                 Distance    7290.699  2139.8901   3.4070435
## 7                 Bathroom  325683.894 20757.3553  15.6900477
## 8           Typet:Distance   -7733.806  8124.4217  -0.9519208
## 9           Typeu:Distance    8362.646  4796.9940   1.7433098
## 10          Typet:Bathroom -285039.717 49342.7532  -5.7767291
## 11          Typeu:Bathroom -158331.092 35996.2689  -4.3985418
## 12          Rooms:Bedroom2   -8591.964  2812.2204  -3.0552241
## 13          Rooms:Distance   -8359.437   692.1602 -12.0773140
## 14       Distance:Bathroom  -12887.120   926.6322 -13.9074811
## 15       Bedroom2:Bathroom   27424.917  4761.8218   5.7593328
## 16 Typet:Distance:Bathroom   17335.574  4355.9057   3.9797863
## 17 Typeu:Distance:Bathroom    5167.350  3695.2765   1.3983662
##          p.value
## 1   1.762708e-09
## 2   5.623295e-01
## 3   1.002268e-08
## 4  1.087606e-210
## 5   2.117675e-17
## 6   6.576559e-04
## 7   3.089860e-55
## 8   3.411456e-01
## 9   8.129081e-02
## 10  7.699546e-09
## 11  1.093938e-05
## 12  2.251098e-03
## 13  1.695532e-33
## 14  8.059078e-44
## 15  8.535507e-09
## 16  6.915593e-05
## 17  1.620145e-01
# Stack the one-row fit summaries of mod15 and mod16 (row 1 = mod15,
# row 2 = mod16) for a side-by-side comparison.
rbind(
  glance(mod15),
  glance(mod16)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4430484     0.4427621 478849.5  1547.281       0 15 -395007.2 790046.4
## 2 0.4434036     0.4430765 478714.4  1355.722       0 17 -394998.5 790033.0
##        BIC     deviance df.residual
## 1 790177.8 6.243983e+15       27231
## 2 790180.8 6.240001e+15       27229

mod16 improves on r.squared, adj.r.squared, deviance and AIC, with only a slight increase in BIC. Overall mod16 is better, so use mod16 to continue.

2.Type,Distance and Rooms

# mod17: the mod16 formula plus the three-way interaction
# Type * Distance * Rooms.
mod17 <- lm(
  Price ~ Type + Rooms + Bedroom2 +
    Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom +
    Type * Distance * Bathroom +
    Type * Distance * Rooms,
  data = house_clean
)
# Coefficient table: estimate, std.error, statistic and p.value per term.
tidy(mod17)
##                       term     estimate   std.error   statistic
## 1              (Intercept)  153335.9339  34079.7792   4.4993230
## 2                    Typet  160337.9267 110819.5919   1.4468374
## 3                    Typeu -232745.3473  56814.8426  -4.0965589
## 4                    Rooms  399003.2372  15222.4403  26.2115159
## 5                 Bedroom2  -87543.1379  10700.6154  -8.1811312
## 6                 Distance    8341.2919   2238.5944   3.7261291
## 7                 Bathroom  303632.1249  22267.0071  13.6359648
## 8           Typet:Distance  -27109.6059   9813.3632  -2.7625194
## 9           Typeu:Distance    9930.7861   5758.4076   1.7245716
## 10          Typet:Bathroom -199280.8888  55257.3461  -3.6064144
## 11          Typeu:Bathroom -139153.5706  38738.4534  -3.5921303
## 12          Rooms:Bedroom2  -11826.7408   3137.0716  -3.7699940
## 13          Rooms:Distance   -8961.5695    756.3693 -11.8481394
## 14       Distance:Bathroom  -12364.8282    952.7664 -12.9778166
## 15       Bedroom2:Bathroom   31094.7155   4987.6865   6.2342963
## 16             Typet:Rooms -131877.5737  39314.2760  -3.3544449
## 17             Typeu:Rooms  -28403.6718  22053.8185  -1.2879253
## 18 Typet:Distance:Bathroom    9357.9669   4876.3165   1.9190647
## 19 Typeu:Distance:Bathroom    5924.2878   3986.1382   1.4862224
## 20    Typet:Rooms:Distance   12015.5136   3370.7415   3.5646499
## 21    Typeu:Rooms:Distance    -966.2631   2166.5391  -0.4459939
##          p.value
## 1   6.845210e-06
## 2   1.479540e-01
## 3   4.205493e-05
## 4  1.412490e-149
## 5   2.933378e-16
## 6   1.948357e-04
## 7   3.368379e-42
## 8   5.739564e-03
## 9   8.461606e-02
## 10  3.110122e-04
## 11  3.285635e-04
## 12  1.635962e-04
## 13  2.641787e-32
## 14  2.125431e-38
## 15  4.604711e-10
## 16  7.963290e-04
## 17  1.977829e-01
## 18  5.498659e-02
## 19  1.372319e-01
## 20  3.649657e-04
## 21  6.556052e-01
# Stack the one-row fit summaries of mod16 and mod17 (row 1 = mod16,
# row 2 = mod17) for a side-by-side comparison.
rbind(
  glance(mod16),
  glance(mod17)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4434036     0.4430765 478714.4  1355.722       0 17 -394998.5 790033.0
## 2 0.4438100     0.4434014 478574.7  1086.205       0 21 -394988.6 790021.1
##        BIC     deviance df.residual
## 1 790180.8 6.240001e+15       27229
## 2 790201.8 6.235445e+15       27225

mod17 improves on r.squared, adj.r.squared, deviance and AIC, with only a slight increase in BIC. Overall mod17 is better, so use mod17 to continue.

3.Type,Distance and Bedroom2

# mod18: the mod17 formula plus the three-way interaction
# Type * Distance * Bedroom2.
mod18 <- lm(
  Price ~ Type + Rooms + Bedroom2 +
    Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom +
    Type * Distance * Bathroom +
    Type * Distance * Rooms +
    Type * Distance * Bedroom2,
  data = house_clean
)
# Coefficient table: estimate, std.error, statistic and p.value per term.
tidy(mod18)
##                       term      estimate   std.error    statistic
## 1              (Intercept)  169347.63521  34998.4482   4.83871840
## 2                    Typet  200533.59581 125923.3403   1.59250537
## 3                    Typeu -252987.55105  62384.0937  -4.05532142
## 4                    Rooms  424402.45478  20818.0364  20.38628650
## 5                 Bedroom2 -121514.98986  20698.8640  -5.87061155
## 6                 Distance    7118.20546   2399.5166   2.96651643
## 7                 Bathroom  310488.20337  22664.7750  13.69915223
## 8           Typet:Distance  -30174.33867  11309.9352  -2.66794974
## 9           Typeu:Distance    9456.36195   6257.1535   1.51128815
## 10          Typet:Bathroom -182834.68192  60324.6025  -3.03084768
## 11          Typeu:Bathroom -143315.26472  41169.9793  -3.48106235
## 12          Rooms:Bedroom2  -11672.78329   3177.1618  -3.67396561
## 13          Rooms:Distance  -10974.27962   1437.1843  -7.63595827
## 14       Distance:Bathroom  -12811.64414    992.5511 -12.90779255
## 15       Bedroom2:Bathroom   30802.12643   5000.7351   6.15951971
## 16             Typet:Rooms -123566.64680  54747.1964  -2.25704063
## 17             Typeu:Rooms  -60955.76448  30861.3012  -1.97515212
## 18          Typet:Bedroom2  -34357.99203  73702.5364  -0.46617109
## 19          Typeu:Bedroom2   41866.78125  40824.4789   1.02553131
## 20       Bedroom2:Distance    2620.30853   1622.3557   1.61512580
## 21 Typet:Distance:Bathroom    8052.68241   5425.1020   1.48433751
## 22 Typeu:Distance:Bathroom    4690.60279   4356.8087   1.07661436
## 23    Typet:Rooms:Distance   11484.61820   4569.3704   2.51339184
## 24    Typeu:Rooms:Distance     182.43529   2865.9145   0.06365692
## 25 Typet:Bedroom2:Distance    2556.61191   6459.5486   0.39578802
## 26 Typeu:Bedroom2:Distance      82.77812   3974.4372   0.02082763
##         p.value
## 1  1.313938e-06
## 2  1.112828e-01
## 3  5.020473e-05
## 4  1.072327e-91
## 5  4.392235e-09
## 6  3.014559e-03
## 7  1.422215e-42
## 8  7.636086e-03
## 9  1.307267e-01
## 10 2.440973e-03
## 11 5.002116e-04
## 12 2.392727e-04
## 13 2.314918e-14
## 14 5.259846e-38
## 15 7.398697e-10
## 16 2.401343e-02
## 17 4.826097e-02
## 18 6.410968e-01
## 19 3.051217e-01
## 20 1.062950e-01
## 21 1.377310e-01
## 22 2.816621e-01
## 23 1.196337e-02
## 24 9.492439e-01
## 25 6.922645e-01
## 26 9.833833e-01
# Stack the one-row fit summaries of mod17 and mod18 (row 1 = mod17,
# row 2 = mod18) for a side-by-side comparison.
rbind(
  glance(mod17),
  glance(mod18)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4438100     0.4434014 478574.7 1086.2050       0 21 -394988.6 790021.1
## 2 0.4439511     0.4434404 478558.0  869.3012       0 26 -394985.1 790024.2
##        BIC     deviance df.residual
## 1 790201.8 6.235445e+15       27225
## 2 790245.9 6.233863e+15       27220

No significant improvement in mod18 (AIC and BIC both increase), so drop “Type*Distance*Bedroom2” and use mod17 to continue.

Add four-variable interactions to improve the model

1.Type*Distance*Rooms*Bathroom

# mod19: the mod17 formula plus the four-way interaction
# Type * Distance * Rooms * Bathroom (which also brings in its
# lower-order terms such as Rooms:Bathroom).
mod19 <- lm(
  Price ~ Type + Rooms + Bedroom2 +
    Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom +
    Type * Distance * Bathroom +
    Type * Distance * Rooms +
    Type * Distance * Rooms * Bathroom,
  data = house_clean
)
# Coefficient table: estimate, std.error, statistic and p.value per term.
tidy(mod19)
##                             term     estimate   std.error  statistic
## 1                    (Intercept)  197037.4274  60078.7490  3.2796526
## 2                          Typet  265137.9541 327949.0300  0.8084731
## 3                          Typeu -125526.2923 166884.8201 -0.7521732
## 4                          Rooms  306849.2792  23488.1264 13.0640169
## 5                       Bedroom2   45898.4173  18206.4912  2.5209919
## 6                       Distance    7502.2076   4285.7436  1.7505032
## 7                       Bathroom  176559.6460  34718.5340  5.0854580
## 8                 Typet:Distance  -33703.2011  29323.4308 -1.1493608
## 9                 Typeu:Distance  -11774.1197  18952.2137 -0.6212530
## 10                Typet:Bathroom -222181.2079 186939.6673 -1.1885183
## 11                Typeu:Bathroom -217065.2160 134803.6052 -1.6102330
## 12                Rooms:Bedroom2  -26792.1267   3675.0170 -7.2903408
## 13                Rooms:Distance   -8788.5811   1278.0110 -6.8767648
## 14             Distance:Bathroom  -11794.7506   2224.2625 -5.3027692
## 15             Bedroom2:Bathroom  -18067.2372   7315.8567 -2.4695997
## 16                   Typet:Rooms -188077.6573 124661.0062 -1.5087128
## 17                   Typeu:Rooms  -86757.1551  67336.5972 -1.2884101
## 18                Rooms:Bathroom   83966.4173  11001.1610  7.6325050
## 19       Typet:Distance:Bathroom   12716.8594  16707.7471  0.7611355
## 20       Typeu:Distance:Bathroom   23288.5469  15676.3234  1.4855873
## 21          Typet:Rooms:Distance   15148.1184  10866.3374  1.3940409
## 22          Typeu:Rooms:Distance    7739.1019   7006.6495  1.1045367
## 23          Typet:Rooms:Bathroom   20254.1479  66564.3271  0.3042793
## 24          Typeu:Rooms:Bathroom   44063.9855  51836.9421  0.8500499
## 25       Rooms:Distance:Bathroom    -100.8202    538.4634 -0.1872368
## 26 Typet:Rooms:Distance:Bathroom   -1634.1501   5823.8502 -0.2805962
## 27 Typeu:Rooms:Distance:Bathroom   -6766.0006   5515.4419 -1.2267377
##         p.value
## 1  1.040655e-03
## 2  4.188254e-01
## 3  4.519534e-01
## 4  6.921665e-39
## 5  1.170812e-02
## 6  8.004280e-02
## 7  3.691711e-07
## 8  2.504173e-01
## 9  5.344384e-01
## 10 2.346397e-01
## 11 1.073586e-01
## 12 3.175949e-13
## 13 6.254998e-12
## 14 1.149484e-07
## 15 1.353252e-02
## 16 1.313838e-01
## 17 1.976142e-01
## 18 2.377664e-14
## 19 4.465827e-01
## 20 1.373999e-01
## 21 1.633166e-01
## 22 2.693701e-01
## 23 7.609174e-01
## 24 3.953048e-01
## 25 8.514763e-01
## 26 7.790223e-01
## 27 2.199318e-01
# Stack the one-row fit summaries of mod17 and mod19 (row 1 = mod17,
# row 2 = mod19) for a side-by-side comparison.
rbind(
  glance(mod17),
  glance(mod19)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4438100     0.4434014 478574.7 1086.2050       0 21 -394988.6 790021.1
## 2 0.4455557     0.4450261 477875.8  841.2844       0 27 -394945.7 789947.5
##        BIC     deviance df.residual
## 1 790201.8 6.235445e+15       27225
## 2 790177.4 6.215874e+15       27219

mod19 is an improvement based on r.squared, adj.r.squared, deviance, AIC and BIC.

Five-variable interactions

# mod20: fully crossed model -- every main effect and every interaction
# (up to five-way) among Type, Distance, Rooms, Bathroom and Bedroom2.
mod20 <- lm(
  Price ~ Type * Distance * Rooms * Bathroom * Bedroom2,
  data = house_clean
)
# Coefficient table: estimate, std.error, statistic and p.value per term.
tidy(mod20)
##                                      term    estimate    std.error
## 1                             (Intercept) 1651739.647  128981.0113
## 2                                   Typet -418833.006 1044812.1727
## 3                                   Typeu -838031.353  556957.3610
## 4                                Distance  -91729.663   10666.6232
## 5                                   Rooms -234309.878   59712.8659
## 6                                Bathroom -564195.797   73732.1048
## 7                                Bedroom2 -194602.888   60362.9576
## 8                          Typet:Distance   80383.701  100787.2013
## 9                          Typeu:Distance    2940.536   57139.3170
## 10                            Typet:Rooms  -75893.890  751899.4101
## 11                            Typeu:Rooms  425971.688  286761.3016
## 12                         Distance:Rooms   27353.467    4599.4055
## 13                         Typet:Bathroom  264978.088  690827.1483
## 14                         Typeu:Bathroom  -40276.510  525677.2145
## 15                      Distance:Bathroom   27100.675    5021.4045
## 16                         Rooms:Bathroom  369138.688   31599.1885
## 17                         Typet:Bedroom2  -22875.230  758403.8631
## 18                         Typeu:Bedroom2 -332726.024  268823.3287
## 19                      Distance:Bedroom2   19785.954    4963.1231
## 20                         Rooms:Bedroom2   72074.133   10973.3148
## 21                      Bathroom:Bedroom2   56597.359   28428.0905
## 22                   Typet:Distance:Rooms   -3856.725   78232.8292
## 23                   Typeu:Distance:Rooms  -14161.389   25388.9501
## 24                Typet:Distance:Bathroom  -46977.940   67662.6185
## 25                Typeu:Distance:Bathroom   42104.014   52067.5833
## 26                   Typet:Rooms:Bathroom  -67525.277  414527.5629
## 27                   Typeu:Rooms:Bathroom -218570.097  249424.0104
## 28                Distance:Rooms:Bathroom  -16926.380    2307.5876
## 29                Typet:Distance:Bedroom2  -38315.488   80037.6443
## 30                Typeu:Distance:Bedroom2   36558.705   31430.4382
## 31                   Typet:Rooms:Bedroom2   40057.516  145714.4692
## 32                   Typeu:Rooms:Bedroom2   12454.291   86747.1302
## 33                Distance:Rooms:Bedroom2   -7438.830     872.9480
## 34                Typet:Bathroom:Bedroom2   -6923.578  430922.0786
## 35                Typeu:Bathroom:Bedroom2  353715.783  235387.8019
## 36             Distance:Bathroom:Bedroom2   -3299.722    2099.3486
## 37                Rooms:Bathroom:Bedroom2  -40456.694    3073.4865
## 38          Typet:Distance:Rooms:Bathroom    7732.189   43297.8940
## 39          Typeu:Distance:Rooms:Bathroom    1320.284   21569.5432
## 40          Typet:Distance:Rooms:Bedroom2    6251.990   13347.4182
## 41          Typeu:Distance:Rooms:Bedroom2   -6211.043    8127.4193
## 42       Typet:Distance:Bathroom:Bedroom2   19057.655   44888.7817
## 43       Typeu:Distance:Bathroom:Bedroom2  -35278.299   26753.1730
## 44          Typet:Rooms:Bathroom:Bedroom2  -16475.944   80852.3486
## 45          Typeu:Rooms:Bathroom:Bedroom2  -42218.691   71071.7925
## 46       Distance:Rooms:Bathroom:Bedroom2    2417.957     203.9803
## 47 Typet:Distance:Rooms:Bathroom:Bedroom2   -2866.244    7533.5852
## 48 Typeu:Distance:Rooms:Bathroom:Bedroom2    6554.167    6354.0599
##       statistic      p.value
## 1   12.80606835 1.945376e-37
## 2   -0.40086919 6.885196e-01
## 3   -1.50465980 1.324232e-01
## 4   -8.59969090 8.415802e-18
## 5   -3.92394293 8.732553e-05
## 6   -7.65196924 2.044701e-14
## 7   -3.22387929 1.266158e-03
## 8    0.79755862 4.251336e-01
## 9    0.05146257 9.589573e-01
## 10  -0.10093623 9.196018e-01
## 11   1.48545737 1.374343e-01
## 12   5.94717444 2.761402e-09
## 13   0.38356641 7.013028e-01
## 14  -0.07661833 9.389277e-01
## 15   5.39703086 6.831887e-08
## 16  11.68190405 1.875305e-31
## 17  -0.03016233 9.759378e-01
## 18  -1.23771261 2.158333e-01
## 19   3.98659349 6.720369e-05
## 20   6.56812770 5.187071e-11
## 21   1.99089555 4.650236e-02
## 22  -0.04929804 9.606821e-01
## 23  -0.55777767 5.770008e-01
## 24  -0.69429680 4.875020e-01
## 25   0.80864161 4.187284e-01
## 26  -0.16289695 8.706008e-01
## 27  -0.87629934 3.808751e-01
## 28  -7.33509747 2.277499e-13
## 29  -0.47871834 6.321429e-01
## 30   1.16316243 2.447738e-01
## 31   0.27490418 7.833919e-01
## 32   0.14357006 8.858410e-01
## 33  -8.52150365 1.655217e-17
## 34  -0.01606689 9.871811e-01
## 35   1.50269377 1.329296e-01
## 36  -1.57178371 1.160124e-01
## 37 -13.16312734 1.888745e-39
## 38   0.17858118 8.582679e-01
## 39   0.06121058 9.511920e-01
## 40   0.46840445 6.394991e-01
## 41  -0.76420855 4.447496e-01
## 42   0.42455274 6.711661e-01
## 43  -1.31865851 1.872944e-01
## 44  -0.20377818 8.385284e-01
## 45  -0.59402880 5.524978e-01
## 46  11.85387573 2.468299e-32
## 47  -0.38046210 7.036054e-01
## 48   1.03149277 3.023190e-01
# Stack the one-row fit summaries of mod19 and mod20 (row 1 = mod19,
# row 2 = mod20) for a side-by-side comparison.
rbind(
  glance(mod19),
  glance(mod20)
)
##   r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC
## 1 0.4455557     0.4450261 477875.8  841.2844       0 27 -394945.7 789947.5
## 2 0.4498955     0.4489449 476185.6  473.2662       0 48 -394838.7 789775.3
##        BIC     deviance df.residual
## 1 790177.4 6.215874e+15       27219
## 2 790177.8 6.167221e+15       27198

Based on r.squared, adj.r.squared, deviance and AIC, mod20 is the best model (its BIC is marginally higher than mod19's, 790177.8 vs 790177.4), but many of the predictors in mod20 are not individually significant.